/*
Date: October 2025
Project: Income and Child Maltreatment: Evidence from a Discontinuity in Tax Benefits
Author: Katherine Rittenhouse
Purpose: Use ACS data to calculate tax value of Dec Birth. Prepare variables to use in birth record prediction.
Files in: usa_00004; CPI_U_RS; povline
Files out: predictors1yr; universe1yr; fortaxsim; refch1yr; norefch1yr; predictorsplusoutcomeacs1yr; logprediction01212025; predictionscatter_value (Figure D1a); predictionscatter_aftertax (Figure D1b)
*/

clear all
set more off

use "usa_00004",clear

****STEP 1 - Create tax units****

*create tax id by combining household serialnumber, family unit, subfamily unit
*drop group quarters
drop if gq==3|gq==4

tostring subfam famunit serial sample,force replace
gen taxid = serial+"_"+sample+"_"+famunit+"_"+subfam

gen dependent = 0
replace dependent = 1 if subfam=="0" & relate==3 & age<18
replace dependent = 1 if subfam!="0" & sfrelate==3 & age<18
bys taxid: egen numdeps=sum(dependent)

*flag for tax filer 
gen head = 0
replace head = 1 if subfam=="0" & relate==1
replace head = 1 if subfam!="0" & sfrelate==1
bys taxid: egen numheads=sum(head)

*flag for tax filer's spouse 
gen spouse = 0
replace spouse = 1 if subfam=="0" & relate==2
replace spouse = 1 if subfam!="0" & sfrelate==2
bys taxid: egen numspouses=sum(spouse)

*flag for tax filer is married, spouse present 
gen married = 0
replace married = 1 if head==1 & (marst ==1 | marst==2)
bys taxid: egen joint=sum(married)

*flag for tax filer is married, spouse absent /separated)
gen separated = 0
replace separated = 1 if head==1 & marst==3
bys taxid: egen numsep=sum(separated)

*flag for household head is unmarried 
gen unmarried = 0
replace unmarried = 1 if head==1 & (marst>3)
bys taxid: egen single=sum(unmarried)

gen mstat=0
*married couple filing jointly
replace mstat = 2 if joint==1

*single/head of household unmarried 
replace mstat = 1 if single==1 

*single/head of household if separated but living with dependents
replace mstat = 1 if numsep==1 & numdeps>0

gen state = 5
ren numdeps depx 

gen d13 = dependent==1 & age<13
gen d17 = dependent==1 & age<17
bys taxid: egen dep13 = sum(d13)
bys taxid: egen dep17 = sum(d17)

gen dep18 = depx


gen wage1=0
replace wage1=incearn if head==1
gen wage2=0
replace wage2=incearn if spouse==1
bys taxid: egen pwages = max(wage1)
bys taxid: egen swages = max(wage2)
gen wagetot=swages+pwages


gen age1=0
replace age1=age if head==1
gen age2=0
replace age2=age if spouse==1
bys taxid: egen page = max(age1)
bys taxid: egen sage = max(age2)
*parents' age at year of child's birth 
replace page = page-age if page!=0
replace sage=sage-age if sage!=0


****Step 2 - gen reference-child-level variables of interest (observed in births data)****
*reference child is youngest dependent, whose "birth value" I want to calculate
gen referencechild = 0
sort taxid age
bys taxid: replace referencechild=1 if _n==1 
replace referencechild = 0 if dependent==0

*how many siblings does this child have?
gen childorder = 0
replace childorder = nsibs+1 if referencechild==1 

*parents' age at ref child's birth
gen momage=.
gen dadage=.
replace momage = age_mom - age if referencechild==1 & age_mom!=.
replace dadage = age_pop - age if referencechild==1 & age_pop!=.

ren year yearorig
ren birthyr year
*recentered year based on birth quarter
gen recentered_yr= year 
replace recentered_yr = year+1 if birthqtr>2

*parents education
*1 - hs degree or less
*2 - some college +
*9 - missing
gen meduc = .
gen feduc = .

replace meduc = 1 if educ_mom<=6 & referencechild==1
replace meduc = 2 if educ_mom>6 & educ_mom!=. & referencechild==1
replace meduc = 9 if educ_mom ==. & referencechild==1

replace feduc = 1 if educ_pop<=6 & referencechild==1
replace feduc = 2 if educ_pop>6 & educ_mom!=. & referencechild==1
replace feduc = 9 if educ_pop ==. & referencechild==1


*parents' race /ethnicity

*parents' race /ethnicity- 1 white, 2 black, 3 ai, 4 asian/pi, 5 hispanic, 9 missing
gen momrace = rachsing_mom
gen dadrace = rachsing_pop

replace momrace = 9 if momrace==.
replace dadrace = 9 if dadrace==.


*1 - <23; 2 - 23-27; 3 - 28-33; 4 - 34-39; 5 - 40+
gen momagebin = 9
replace momagebin = 1 if momage<23
replace momagebin = 2 if momage>=23 & momage<=27
replace momagebin = 3 if momage>=28 & momage<=33
replace momagebin = 4 if momage>=34 & momage<=39
replace momagebin = 5 if momage>=40 & momage!=.
gen dadagebin = 9
replace dadagebin = 1 if dadage<23
replace dadagebin = 2 if dadage>=23 & dadage<=27
replace dadagebin = 3 if dadage>=28 & dadage<=33
replace dadagebin = 4 if dadage>=34 & dadage<=39
replace dadagebin = 5 if dadage>=40 & dadage!=.


****
****STEP 3 - Create universe; keep reference child****
*create universe - tax units with at least one child under age 5 
gen universe = 0
//taxid with dependent under 5 years

replace universe = 1 if referencechild==1 & age<5

keep if referencechild==1 & universe==1
drop if mstat==0
sort taxid
gen taxsimid=_n 

preserve

keep taxsimid year recentered_yr mstat page sage depx dep13 dep17 dep18 pwages swages momagebin* dadagebin* meduc feduc momrace dadrace age  hhwt perwt puma cpuma childorder wagetot

save "predictors1yr.dta",replace

restore
save "universe1yr.dta",replace


keep taxsimid year state mstat page sage depx dep13 dep17 dep18 pwages swages 

gen dividends=0
gen intrec=0
gen stcg=0
gen ltcg=0
gen otherprop=0
gen nonprop=0
gen pensions=0
gen gssi=0
gen ui=0
gen transfers=0
gen rentpaid=0
gen proptax=0
gen otheritem=0
gen childcare=0
gen mortgage=0
gen scorp=0
gen pbusinc=0
gen pprofinc=0
gen sbusinc=0
gen sprofinc=0
save "fortaxsim.dta",replace
* run through taxsim 
taxsim32,full replace 

shell ren "taxsim_out.dta" "refch1yr.dta"

* run through taxsim with one fewer child 

use "fortaxsim.dta",clear 
replace depx = depx-1
replace dep13=dep13-1
replace dep17=dep17-1
replace dep18=dep18-1
taxsim32,full replace 

shell ren "taxsim_out.dta" "norefch1yr.dta"


use "norefch1yr.dta",clear 
ren * *_noch
ren taxsimid_noch taxsimid
merge 1:1 taxsim using "refch1yr.dta"
gen dollarvalue =  (fiitax_noch+siitax_noch) - (fiitax + siitax)
gen totaltax=fiitax + siitax
gen eitcdiff = v25- v25_noch


keep taxsim eitc dollarvalue totaltax
merge 1:1 taxsim using "predictors1yr.dta"
drop _m
gen aftertax = wagetot-totaltax

save "predictorsplusoutcomeacs1yr",replace

import excel using "CPI_U_RS.xlsx",clear
keep A B
drop if B=="" | B=="CPI-U-RS1 Index               (December 1977 = 100)"
ren (A B) (year CPI)
destring year CPI,force replace

merge 1:m year using "predictorsplusoutcomeacs1yr"
keep if _m==3
drop _m
gen CPI2017 = 361
gen aftertaxreal = aftertax*(CPI2017/CPI)
gen dollarvaluereal = dollarvalue*(CPI2017/CPI)


save "predictorsplusoutcomeacs1yr",replace


*linear prediction 

log using "logprediction01212025",replace
use "predictorsplusoutcomeacs1yr",clear


local fcontrols "meduc feduc childorder momrace dadrace momagebin dadagebin recentered_yr cpuma0010"
local ctrls "i.(`fcontrols')##i.(`fcontrols')"
local fcontrols_1child "meduc feduc momrace dadrace momagebin dadagebin recentered_yr cpuma0010"
local ctrls_1child "i.(`fcontrols_1child')##i.(`fcontrols_1child')"
replace childorder=3 if childorder>3 

splitsample, generate(splitsample) split(.75 .25) rseed(12345)
label define slabel 1 "Training" 2 "Validation"
label values splitsample slabel

regress aftertax `ctrls' [pweight=hhwt] if splitsample==1
estimates store olsaftertax


lasso linear aftertax `ctrls' if splitsample==1
estimates store lassoaftertax

regress dollarvalue `ctrls' [pweight=hhwt] if splitsample==1
estimates store olsdollarvalue

lasso dollarvalue `ctrls' [pweight=hhwt] if splitsample==1
estimates store lassodollarvalue


estimates restore olsaftertax
predict aftertax_hat

estimates restore olsdollarvalue
predict value_hat 

estimates restore lassoaftertax
predict aftertax_hat_las

estimates restore lassodollarvalue
predict value_hat_las

lassogof olsdollarvalue olsaftertax if splitsample==2

twoway (scatter value_hat dollarvalue if splitsample==2 & value_hat>-2000,msize(vtiny) mcolor(gray) xtitle("Actual") ytitle("Predicted")) (lfit value_hat dollarvalue if splitsample==2,lcolor(red) legend(off) note(Out-of-sample r-squared = 0.426)), scheme(s1color)
graph export "predictionscatter_value.png", replace

twoway (scatter aftertax_hat aftertax if aftertax<600000 & splitsample==2,msize(vtiny) mcolor(gray) xtitle("Actual") ytitle("Predicted")) (lfit aftertax_hat aftertax if splitsample==2 & aftertax<600000,lcolor(red) legend(off) note(Out-of-sample r-squared = 0.582)), scheme(s1color)
graph export "predictionscatter_aftertax.png", replace

merge m:1 year using "povline.dta"
keep if _m==3
drop _m 

gen lowinc = aftertaxreal<2*povline 
gen lowinc_hat = aftertax_hat<2*povline

tab lowinc lowinc_hat if splitsample==2

log close
